knitr::opts_chunk$set(echo = TRUE, cache = TRUE,cache.lazy = FALSE, echo=TRUE)
source("http://bioconductor.org/biocLite.R")
## Bioconductor version 3.6 (BiocInstaller 1.28.0), ?biocLite for help
## A new version of Bioconductor is available after installing the most
##   recent version of R; see http://bioconductor.org/install
require("stringr")
## Loading required package: stringr
require("tidyverse")
## Loading required package: tidyverse
## ── Attaching packages ────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0     ✔ readr   1.1.1
## ✔ tibble  1.4.2     ✔ purrr   0.2.5
## ✔ tidyr   0.8.1     ✔ dplyr   0.7.6
## ✔ ggplot2 3.0.0     ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
require("MASS")
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# require("ggplot2")
# require("dplyr")
# require("magrittr")
require(extrafont)
## Loading required package: extrafont
## Registering fonts with R
loadfonts(quiet = T)

set.seed("20180927")

require(data.table)
## Loading required package: data.table
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
#' Convert every column of a table to character.
#'
#' @param df A data.frame, or any object `as.data.frame()` accepts
#'   (for example a matrix).
#' @return A data.frame with the same dimensions in which every column is a
#'   character vector.
df_fac2chr <- function(df) {
  df <- as.data.frame(df)
  # seq_len() is empty for a zero-column table; `1:ncol(df)` would iterate
  # over c(1, 0) and fail with "undefined columns selected".
  for (i in seq_len(ncol(df))) {
    df[[i]] <- as.character(df[[i]])
  }
  df
}

#' Convert every column of a table to numeric.
#'
#' Each column is first converted to character and then to numeric, so factor
#' columns are converted by their labels rather than by their integer codes.
#' Values that cannot be parsed as numbers become `NA` (with the usual
#' `as.numeric()` coercion warning).
#'
#' @param df A data.frame, or any object `as.data.frame()` accepts
#'   (for example a matrix).
#' @return A data.frame with the same dimensions in which every column is a
#'   numeric vector.
df_fac2num <- function(df) {
  df <- as.data.frame(df)
  # seq_len() is empty for a zero-column table; `1:ncol(df)` would iterate
  # over c(1, 0) and fail with "undefined columns selected".
  for (i in seq_len(ncol(df))) {
    df[[i]] <- as.numeric(as.character(df[[i]]))
  }
  df
}




#' Keep or drop rows of a table by position while preserving row names.
#'
#' @param df A data.frame, or any object `as.data.frame()` accepts.
#' @param cond One of:
#'   * a logical vector with one entry per row (`TRUE` keeps the row; `NA`
#'     drops it);
#'   * a numeric vector of strictly positive row positions to keep, e.g.
#'     `c(1, 3, 5, 2)` (rows are returned in their original order and
#'     duplicates are ignored);
#'   * a numeric vector without any positive value, e.g. `c(-2, -4, -5)`,
#'     whose absolute values are the row positions to drop.
#'   Mixing positive and non-positive positions is an error. Any other type
#'   leaves the rows untouched.
#' @return A data.frame containing the selected rows, with the row names of
#'   the input carried over for those rows.
filter_i <- function(df, cond) {
  df <- as.data.frame(df)
  # seq_len() copes with a zero-row table; `1:nrow(df)` would give c(1, 0).
  positions <- seq_len(nrow(df))
  original_names <- rownames(df)

  if (is.logical(cond)) {
    stopifnot(length(cond) == nrow(df))
    # positions[cond] turns NA entries into NA positions, which never match.
    keep <- positions %in% positions[cond]
  } else if (is.numeric(cond)) {
    n_positive <- sum(cond > 0)
    # Either every index is positive (keep) or none is (drop).
    stopifnot(n_positive == 0 || n_positive == length(cond))

    if (n_positive == length(cond)) {
      keep <- positions %in% abs(cond)
    } else {
      keep <- !(positions %in% abs(cond))
    }
  } else {
    # Unsupported condition types leave the table unfiltered.
    keep <- rep(TRUE, nrow(df))
  }

  out <- df[keep, , drop = FALSE]
  # Re-attach the original row names explicitly so they are always stored as
  # the character labels of the input rows.
  rownames(out) <- original_names[keep]
  out
}
# ---- Analysis parameters ----
target_dis <- "LIHC"
q_value_cuttoff <- 0.0001
#q_value_cuttoff <- 0.01

# ---- Raw inputs ----
# Array annotation table (tab separated).
g2_origin <- fread(
  file = "./2Garray.annot_2.txt",
  sep = "\t",
  stringsAsFactors = FALSE
)

# RNA-seq expression table (tab separated).
rna_origin <- fread(
  file = "./gdac.broadinstitute.org_LIHC.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level_3.2016012800.0.0/LIHC.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt",
  sep = "\t",
  stringsAsFactors = FALSE
)

# Clustering result: only the second and third columns of the CSV are kept.
clin_origin <- as.data.frame(
  dplyr::select(
    fread(file = "./clst_clin2.csv", sep = ",", stringsAsFactors = FALSE),
    2:3
  )
)

# Clinical table: read with the first column as row names, transpose it,
# convert every column to character, then upper-case the row names and turn
# "." into "-".
clin_pick <- read.table(
  file = "gdac.broadinstitute.org_LIHC.Clinical_Pick_Tier1.Level_4.2016012800.0.0/LIHC.clin.merged.picked.txt",
  header = TRUE,
  row.names = 1,
  sep = "\t",
  stringsAsFactors = FALSE
)
clin_pick <- df_fac2chr(t(clin_pick))
rownames(clin_pick) <- str_replace_all(
  toupper(rownames(clin_pick)),
  "\\.",
  "\\-"
)
# colnames(rna_origin)[1]

# Build `rna`: the expression table restricted to sample-type "01" columns,
# with one row per gene identifier and one column per 12-character barcode.
rna <- rna_origin %>%
  # Drop rows whose "Hybridization REF" value contains a "?".
  filter(!str_detect(.[["Hybridization REF"]], "\\?")) %>%
  # Rename the identifier column so it can be used as a bare name in mutate().
  `colnames<-`(str_replace_all(colnames(.), "Hybridization REF", "Hybridization.REF")) %>%
  # Replace every "|" in the identifier with "_".
  mutate(Hybridization.REF = str_replace_all(.[["Hybridization.REF"]], "\\|", "_")) %>%
  # Drop the first remaining row.
  # NOTE(review): presumably the secondary header row of the file - confirm.
  filter_i(cond = -1) %>%
  `rownames<-`(.[["Hybridization.REF"]]) %>%
  # Keep only columns whose name has "01" at characters 14-15; this also
  # removes the identifier column itself.
  # NOTE(review): "01" looks like the TCGA primary-tumour sample code - confirm.
  dplyr::select(c(1:ncol(.))[str_sub(colnames(.), 14,15) == "01"]) %>%
  # Shorten the column names to their first 12 characters and use "-" instead
  # of "." as separator.
  `colnames<-`(colnames(.) %>%
                 str_sub(1, 12) %>%
                 str_replace_all("\\.", "-"))

# Transposed copy (one row per column of `rna`) with every column converted to
# numeric by df_fac2num().
rna_t <- rna %>%
  t() %>%
  df_fac2num()
# Build `g2`, the filtered gene table derived from the array annotation.
g2 <- g2_origin %>%
  # Rename a "Row.names" column to "Probe_setID".
  `colnames<-`(str_replace_all(colnames(.),  "Row.names", "Probe_setID")) %>%
  # Keep rows below the q-value cut-off, ordered by decreasing avg_sub.
  dplyr::filter(q.value < q_value_cuttoff) %>%
  dplyr::arrange(-avg_sub) %>%
  # Derive a single symbol per row from Gene.Symbol: remove every
  # "LOC<digits> /// " prefix, then remove a trailing " /// ..." tail made of
  # letters, digits, "-", "/" and spaces.
  mutate(gene4match = Gene.Symbol %>%
           str_replace_all( "LOC[0-9]* /// ", "") %>%
           str_replace_all(" /// [-\ /a-zA-Z0-9]*$","")) %>%
  # Drop rows whose symbol is the placeholder "---".
  dplyr::filter(gene4match != "---") %>%
  # Map the symbols through limma::alias2SymbolTable().
  mutate(gene4match = limma::alias2SymbolTable(gene4match)) %>%
  # Keep the first row per symbol; because of the ordering above this is the
  # row with the largest avg_sub.
  dplyr::filter(!duplicated(gene4match))

# Symbols belonging to gene cluster "A" and gene cluster "B" respectively.
A_symbols <- g2$gene4match[g2$gene_cluster =="A"]
B_symbols <- g2$gene4match[g2$gene_cluster =="B"]
# clin_origin %>%
#   as.data.frame() %>%
#   colnames(.) %>%
#   .[str_detect(., "pathology")]

#' Collapse "<cluster>_<pT>" tags into the groups used for comparison.
#'
#' The rules are applied in this order of precedence:
#'   1. a tag equal to "Major_t1" maps to `NA`;
#'   2. a tag containing "Major_t2", "Major_t3" or "Major_t4" maps to
#'      "Major\nt2~4";
#'   3. a tag containing "Minor" maps to "Minor";
#'   4. anything else (including `NA`) maps to `NA`.
#'
#' The previous pattern `Major_t[2-4]*` allowed zero digits and therefore
#' matched every tag containing "Major_t"; the digit is now required.
#'
#' @param vec Character vector of tags such as "Major_t2" or "Minor_t1".
#' @return A character vector of the same length as `vec`.
getLimitTag <- function(vec) {
  vec <- as.character(vec)
  result_vec <- rep(NA_character_, length(vec))

  # Assign in reverse order of precedence so that later rules overwrite
  # earlier ones.
  result_vec[grepl("Minor", vec)] <- "Minor"
  result_vec[grepl("Major_t[2-4]", vec)] <- "Major\nt2~4"
  result_vec[vec %in% "Major_t1"] <- NA_character_

  result_vec
}


# Build the clinical table `clin` by joining the clustering result with the
# clinical table: the first column of clin_origin is matched against the row
# names of clin_pick (by.y = 0), so the key ends up in a column "Row.names".
clin <- merge(x = clin_origin, y = clin_pick, by.x = 1, by.y = 0) %>%
  # pT: T stage without a trailing a/b/c suffix; "tx" becomes NA.
  mutate(pT = str_remove(pathology_T_stage, "[abc]*$") %>%
           ifelse(.=="tx", NA, .) %>%
             as.factor())  %>%
  # Missing day counts are replaced by 0 before conversion to numeric ...
  mutate(days_to_death = 
           as.numeric(ifelse(is.na(days_to_death), 0, days_to_death))) %>%
  mutate(days_to_last_followup =
           as.numeric(ifelse(is.na(days_to_last_followup), 0, days_to_last_followup))) %>%
  mutate(vital_status = as.numeric(vital_status)) %>%
  # ... so that `time` is the larger of the two day counts.
  mutate(time = pmax(days_to_death, days_to_last_followup)) %>%
  mutate(var = clst_result) %>%
  # Cluster label: clst_result 1 is "Major", every other value "Minor".
  mutate(clst = ifelse(clst_result ==1, "Major", "Minor")) %>%
  # Combined "<cluster>_<pT>" tag and its collapsed form from getLimitTag().
  mutate(eval_tag = paste(clst, pT, sep="_")) %>%
  mutate(limit_tag = getLimitTag(eval_tag))


# Subset used for the t1-vs-t2 comparison: "Major" cluster rows with pT t1 or
# t2, sorted by pT, with a colour per stage (t1 = "red", t2 = "yellow").
clin_mj_t12 <- clin %>%
  dplyr::filter(pT %in% c("t1", "t2")) %>%
  dplyr::filter(clst =="Major") %>%
  dplyr::mutate(t_col = ifelse(pT =="t1", "red", "yellow")) %>%
  dplyr::arrange(pT)

# Row counts per pT level in the subset (unused factor levels are still listed).
table(clin_mj_t12$pT)
## 
##  t1  t2  t3  t4 
## 181  89   0   0
#' Positions in `sort_vector` of each element of `ref_vector`, in reference order.
#'
#' For every element of `ref_vector` (in order) the positions at which
#' `sort_vector` equals it are collected. Reference elements that do not occur
#' contribute nothing, and elements occurring several times contribute all of
#' their positions, so `sort_vector[ref_sort(sort_vector, ref_vector)]` is
#' `sort_vector` reordered to follow `ref_vector`.
#'
#' @param sort_vector Vector whose positions are returned.
#' @param ref_vector Vector defining the desired order.
#' @return An integer vector of positions into `sort_vector`.
ref_sort <- function(sort_vector, ref_vector) {
  # Collect the matches per reference element and flatten once, instead of
  # growing the result with c() on every iteration.
  hits <- lapply(
    seq_along(ref_vector),
    function(i) which(sort_vector == ref_vector[i])
  )
  if (length(hits) == 0L) {
    return(integer(0))
  }
  unlist(hits)
}
# Build the matrix-like table handed to DESeq2 as countData.
rna_deg <- rna_t %>%
  as.data.frame() %>%
  # Keep only the rows of rna_t whose name is a key of clin_mj_t12.
  .[rownames(.) %in% clin_mj_t12$Row.names,] %>%
  # Round the expression values to whole numbers.
  round() %>%
  # Put the rows in the same order as clin_mj_t12.
  .[ref_sort(sort_vector = rownames(.), ref_vector = clin_mj_t12$Row.names),] %>%
  # Transpose (the former rows become columns) and make every column numeric.
  t() %>%
  df_fac2num() %>%
  # Drop rows whose values sum to zero.
  .[apply(., 1, sum) !=0, ] %>%
  # Re-apply the clin_mj_t12 order to the columns.
  # NOTE(review): the rows were already ordered this way before t(), so this
  # looks redundant - confirm before removing.
  .[, ref_sort(sort_vector = colnames(.), ref_vector = clin_mj_t12$Row.names)]

# sum(rownames(rna_deg) == clin_mj_t12$Row.names)
require(DESeq2)
## Loading required package: DESeq2
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colMeans,
##     colnames, colSums, do.call, duplicated, eval, evalq, Filter,
##     Find, get, grep, grepl, intersect, is.unsorted, lapply,
##     lengths, Map, mapply, match, mget, order, paste, pmax,
##     pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
##     rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
##     tapply, union, unique, unsplit, which, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:data.table':
## 
##     first, second
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:tidyr':
## 
##     expand
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:data.table':
## 
##     shift
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:purrr':
## 
##     reduce
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following object is masked from 'package:base':
## 
##     apply
# Sample annotation for DESeq2: a single factor `condition` taken from the pT
# column of clin_mj_t12 (one row per row of clin_mj_t12).
# NOTE(review): this assumes the columns of rna_deg are in the same order as
# the rows of clin_mj_t12 - confirm both cover exactly the same samples.
colData <- data.frame(condition= as.factor(clin_mj_t12$pT))
# Wrap counts and annotation in a DESeqDataSet with the design ~condition.
e <- DESeqDataSetFromMatrix(countData = rna_deg, colData = colData, design = ~condition)
## converting counts to integer mode
## factor levels were dropped which had no samples
# Run the DESeq2 pipeline on the data set.
e <- DESeq(e)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 2613 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
# Pull the test results out of the fitted DESeq2 object.
result_e <- results(e)

# Missing p-values and adjusted p-values are replaced by 1.
p.val <- replace(result_e$pvalue, is.na(result_e$pvalue), 1)
q.val <- replace(result_e$padj, is.na(result_e$padj), 1)
foldchange <- result_e$log2FoldChange
ranking <- rank(p.val)

# One row per row of rna_deg, holding the statistics above.
result_deg <- data.frame(
  p.val = p.val,
  q.val = q.val,
  ranking = ranking,
  foldchange = foldchange,
  row.names = rownames(rna_deg),
  stringsAsFactors = FALSE
)

# `symbol` is the row name without its trailing "_<digits>" part; `rowlabel`
# starts out as NA.
result_deg <- as.data.frame(
  mutate(
    result_deg,
    symbol = str_replace_all(rownames(result_deg), "_[0-9]*$", ""),
    rowlabel = NA
  )
)

#i <- 3
#is.element(el =  rownames(result_deg)[i],set = B_symbols)
# Label each gene with the cluster its symbol belongs to: "A" when the symbol
# is in A_symbols, otherwise "B" when it is in B_symbols, otherwise the label
# stays NA. "B" is assigned first so that "A" takes precedence for a symbol
# present in both sets. The vectorised form replaces a scalar loop over
# 1:nrow(result_deg), which iterated over c(1, 0) for an empty table.
result_deg$rowlabel[result_deg[["symbol"]] %in% B_symbols] <- "B"
result_deg$rowlabel[result_deg[["symbol"]] %in% A_symbols] <- "A"

# Genes whose adjusted p-value (q.val) is below q_cut.
q_cut <- 0.01
degs <- result_deg %>%
  dplyr::filter(q.val < q_cut)
# Heatmap matrix: rows of rna_deg whose symbol (row name without the trailing
# "_<digits>") occurs in g2$gene4match.
rna4heat <- rna_deg %>%
  .[str_remove_all(rownames(.), "_[0-9]*$") %in% g2$gene4match,] %>%
  # Use the bare symbol as row name.
  `rownames<-`(str_remove_all(rownames(.), "_[0-9]*$")) %>%
  # Order rows like g2$gene4match and columns like clin_mj_t12.
  .[ref_sort(sort_vector = rownames(.), ref_vector = g2$gene4match),] %>%
  .[, ref_sort(sort_vector = colnames(.), ref_vector = clin_mj_t12$Row.names)] %>%
  as.matrix(.)

# log2(x + 1) transform of the values.
rna4heat <- log2(rna4heat +1) 

class(rna4heat)
## [1] "matrix"
# Side colours: rows are red for gene cluster "A" and blue otherwise, limited
# to the g2 genes present in the matrix; columns use the per-sample t_col.
rowslider <- ifelse(g2$gene_cluster == "A", "red", "blue")[g2$gene4match %in% rownames(rna4heat)]
colslider <- clin_mj_t12$t_col


require(gplots)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:IRanges':
## 
##     space
## The following object is masked from 'package:S4Vectors':
## 
##     space
## The following object is masked from 'package:stats':
## 
##     lowess
# Draw the heatmap: rows are clustered (row dendrogram only), columns keep
# their given order and are drawn without labels.
heatmap.2(
  x = rna4heat,
  Rowv = TRUE,
  Colv = FALSE,
  dendrogram = "row",
  ColSideColors = colslider,
  RowSideColors = rowslider,
  trace = "none",
  col = bluered(256),
  labCol = FALSE
)

RNA metadata

# One row per column name of rna_origin, without the first one.
rna_meta <- data.frame(name = colnames(rna_origin))
rna_meta <- filter_i(rna_meta, -1)

# Split each name into fixed-position fields.
rna_meta <- mutate(
  rna_meta,
  barcode_patient = str_sub(name, 1, 12),
  barcode_sample = str_sub(name, 14, 15),
  barcode_vial = str_sub(name, 16, 16),
  barcode_portion = str_sub(name, 18, 19),
  bcr_portion_barcode = str_sub(name, 1, 19),
  barcode_analyte = str_sub(name, 20,20),
  barcode_plate = str_sub(name, 22, 25),
  barcode_center = str_sub(name, 27, 28)
)

# Keep only entries with sample code "01".
rna_meta <- dplyr::filter(rna_meta, barcode_sample == "01")


# Attach the metadata to the clinical table: the first column of `clin` is
# matched against barcode_patient.
clin_rna_meta <- merge(clin, rna_meta, by.x=1, by.y = "barcode_patient")
# require(XML)
# require("methods")
# Full paths of every *.xml file below ./specimen_info (searched recursively,
# hidden files included).
biolist <- list.files(
  path = "./specimen_info",
  pattern = "\\.xml$",
  all.files = TRUE,
  full.names = TRUE,
  recursive = TRUE
)
# Full paths of every file below ./specimen_info whose name ends in
# "annotations.txt".
anotlist <- list.files(
  path = "./specimen_info",
  pattern = "annotations\\.txt$",
  all.files = TRUE,
  full.names = TRUE,
  recursive = TRUE
)




require(xml2)
## Loading required package: xml2
#require(rvest)
#require(purrr)

#' Text of the child nodes of an XML node (set), named by element name.
#'
#' @param ndset An xml2 node or node set.
#' @return A character vector with one entry per child node returned by
#'   `xml_children(ndset)`: the value is the child's text and the name is the
#'   child's element name.
get_texVec_chldr <- function(ndset) {
  children <- xml_children(ndset)
  child_text <- xml_text(children)
  names(child_text) <- xml_name(children)
  child_text
}



# Empty accumulators filled while the specimen XML files are parsed: one table
# each for samples, portions, analytes and slides.
spec_df <- data.frame()
portion_df <- data.frame()
analyte_df <- data.frame()
slide_df <- data.frame()

# i <- 1
# j <- 1
# Remove leftover loop counters, but only those that actually exist, so no
# "object not found" warning is raised for the missing ones.
rm(list = base::intersect(c("i", "j", "k", "l"), ls()))
## Warning in rm(i, j, k, l): object 'j' not found
## Warning in rm(i, j, k, l): object 'k' not found
## Warning in rm(i, j, k, l): object 'l' not found
# Parse every specimen XML file and append one row per sample, portion,
# analyte and slide to spec_df, portion_df, analyte_df and slide_df.
#
# Fix: analytes and slides are now looked up below the current portion
# (`ports_j[k]`) only. Searching below the whole `ports_j` node set attached
# the analytes and slides of every portion of the sample to each portion.
# All loops use seq_len() so that empty node sets are skipped instead of
# being indexed with c(1, 0).
for (i in seq_len(length(biolist))) {
  x <- biolist[i]
  # The tag is the leading run of lower-case letters, digits and "-" that
  # follows "./specimen_info/" in the path, i.e. the first directory name.
  tag <- x %>%
    str_remove("\\.\\/specimen_info\\/") %>%
    str_extract("^[-a-z0-9]*")

  x_i <- read_xml(x = x)
  ch <- xml_find_all(x = x_i, xpath = ".//bio:sample")

  # sample loop
  for (j in seq_len(length(ch))) {
    ch_j <- ch[j]
    vec <- c(tag = tag, get_texVec_chldr(ch_j))
    spec_df <- bind_rows(spec_df, vec)

    # Identifiers carried down to the portion rows.
    tag4port <- vec[c("tag", "bcr_sample_barcode", "sample_type_id")]
    ports_j <- xml_find_all(x = ch_j, xpath = ".//bio:portion")

    # portion loop
    for (k in seq_len(length(ports_j))) {
      port_k <- ports_j[k]
      vec_port <- port_k %>%
        get_texVec_chldr() %>%
        c(tag4port, .)
      portion_df <- bind_rows(portion_df, vec_port)

      # Identifiers carried down to the analyte and slide rows.
      tag4analyte <- vec_port[c("tag", "bcr_sample_barcode", "sample_type_id", "bcr_portion_barcode")]

      anals_k <- xml_find_all(x = port_k, xpath = ".//bio:analyte")
      slide_k <- xml_find_all(x = port_k, xpath = ".//bio:slide")

      # analyte loop
      for (l in seq_len(length(anals_k))) {
        vec_anal <- get_texVec_chldr(anals_k[l]) %>%
          c(tag4analyte, .)

        analyte_df <- bind_rows(analyte_df, vec_anal)
      }

      # slide loop
      for (l in seq_len(length(slide_k))) {
        vec_sld <- get_texVec_chldr(slide_k[l]) %>%
          c(tag4analyte, .)

        slide_df <- bind_rows(slide_df, vec_sld)
      }
    }
  }
}
# Sample table: make every column character, then display it.
spec_df <- df_fac2chr(spec_df)

DT::datatable(data = spec_df, caption = "specimen_info")
DT::datatable(data = portion_df, caption = "portion_info")

# Analyte table: add the first 12 characters of the sample barcode as
# `barcode`, then display it.
analyte_df <- mutate(analyte_df, barcode = str_sub(bcr_sample_barcode, 1, 12))
DT::datatable(data = analyte_df, caption = "analyte_info")

# Slide table: same `barcode` column, then display it.
slide_df <- mutate(slide_df, barcode = str_sub(bcr_sample_barcode, 1, 12))
DT::datatable(data = slide_df, caption = "slide_info")
# Read every annotations file and stack them into anot_df, tagging each row
# with the directory-derived tag of the file it came from. Newer files are
# placed in front of the rows collected so far.
anot_df <- data.frame()
for (i in seq_along(anotlist)) {
  # The tag is the leading run of lower-case letters, digits and "-" that
  # follows "./specimen_info/" in the path.
  tag <- anotlist[i] %>%
    str_remove("\\.\\/specimen_info\\/") %>%
    str_extract("^[-a-z0-9]*")
  print(tag)

  # Text columns are read as character: factor columns with differing levels
  # made bind_rows() emit a coercion warning for every column of every file.
  ant <- read.table(
    file = anotlist[i],
    sep = "\t",
    fill = TRUE,
    header = TRUE,
    stringsAsFactors = FALSE
  )
  # rep() keeps this valid for a file without data rows.
  ant$tag <- rep(tag, nrow(ant))
  anot_df <- bind_rows(ant, anot_df)
}
## [1] "08b5366e-bf95-49d0-b01f-21258bb2b179"
## Warning in read.table(file = anotlist[i], sep = "\t", fill = T, header =
## T): incomplete final line found by readTableHeader on './specimen_info/
## 08b5366e-bf95-49d0-b01f-21258bb2b179/annotations.txt'
## [1] "0bccac0a-7a17-407a-b194-68ab4ea15bbf"
## Warning in read.table(file = anotlist[i], sep = "\t", fill = T, header =
## T): incomplete final line found by readTableHeader on './specimen_info/
## 0bccac0a-7a17-407a-b194-68ab4ea15bbf/annotations.txt'
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## [1] "214d2a49-8bec-412d-b304-cf73ff0aac89"
## Warning in read.table(file = anotlist[i], sep = "\t", fill = T, header =
## T): incomplete final line found by readTableHeader on './specimen_info/
## 214d2a49-8bec-412d-b304-cf73ff0aac89/annotations.txt'
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## [1] "45c6aac9-3216-4192-9687-a8cd1cbdd400"
## Warning in read.table(file = anotlist[i], sep = "\t", fill = T, header =
## T): incomplete final line found by readTableHeader on './specimen_info/
## 45c6aac9-3216-4192-9687-a8cd1cbdd400/annotations.txt'
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## [1] "4762076f-01b7-4804-847e-b5dd65f56dc0"
## Warning in read.table(file = anotlist[i], sep = "\t", fill = T, header =
## T): incomplete final line found by readTableHeader on './specimen_info/
## 4762076f-01b7-4804-847e-b5dd65f56dc0/annotations.txt'
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## [1] "536aa558-4da8-468d-9fb0-e2c76dbe5c0e"
## Warning in read.table(file = anotlist[i], sep = "\t", fill = T, header =
## T): incomplete final line found by readTableHeader on './specimen_info/
## 536aa558-4da8-468d-9fb0-e2c76dbe5c0e/annotations.txt'
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## [1] "6383248f-42a3-42e2-942e-b512323111f0"
## Warning in read.table(file = anotlist[i], sep = "\t", fill = T, header =
## T): incomplete final line found by readTableHeader on './specimen_info/
## 6383248f-42a3-42e2-942e-b512323111f0/annotations.txt'
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## [1] "71ab84fb-5491-4eef-aeed-7b41480688c5"
## Warning in read.table(file = anotlist[i], sep = "\t", fill = T, header =
## T): incomplete final line found by readTableHeader on './specimen_info/
## 71ab84fb-5491-4eef-aeed-7b41480688c5/annotations.txt'
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## [1] "870d86fc-9e7b-4c54-9f83-5f9eb3a79b2c"
## Warning in read.table(file = anotlist[i], sep = "\t", fill = T, header =
## T): incomplete final line found by readTableHeader on './specimen_info/
## 870d86fc-9e7b-4c54-9f83-5f9eb3a79b2c/annotations.txt'
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## [1] "8d14c291-7a0e-442c-a1b0-e094b578bc25"
## Warning in read.table(file = anotlist[i], sep = "\t", fill = T, header =
## T): incomplete final line found by readTableHeader on './specimen_info/
## 8d14c291-7a0e-442c-a1b0-e094b578bc25/annotations.txt'
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## [1] "a8a69e33-661b-4607-b279-9e4b8efc6a2a"
## Warning in read.table(file = anotlist[i], sep = "\t", fill = T, header =
## T): incomplete final line found by readTableHeader on './specimen_info/
## a8a69e33-661b-4607-b279-9e4b8efc6a2a/annotations.txt'
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## [1] "b661f0e3-3bdb-4ef4-b89f-2d84750931d9"
## Warning in read.table(file = anotlist[i], sep = "\t", fill = T, header =
## T): incomplete final line found by readTableHeader on './specimen_info/
## b661f0e3-3bdb-4ef4-b89f-2d84750931d9/annotations.txt'
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## [1] "ce81ed1c-e10b-4e62-93c3-ad2f25718f6f"
## Warning in read.table(file = anotlist[i], sep = "\t", fill = T, header =
## T): incomplete final line found by readTableHeader on './specimen_info/
## ce81ed1c-e10b-4e62-93c3-ad2f25718f6f/annotations.txt'
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## [1] "d5471fd1-ea28-4bab-9827-48fdbd129efd"
## Warning in read.table(file = anotlist[i], sep = "\t", fill = T, header =
## T): incomplete final line found by readTableHeader on './specimen_info/
## d5471fd1-ea28-4bab-9827-48fdbd129efd/annotations.txt'
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## [1] "ea7d662a-154f-4984-90dc-b2072f2497c5"
## Warning in read.table(file = anotlist[i], sep = "\t", fill = T, header =
## T): incomplete final line found by readTableHeader on './specimen_info/
## ea7d662a-154f-4984-90dc-b2072f2497c5/annotations.txt'
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## [1] "f74bd49a-d0f8-42b4-98de-db3a5a08d049"
## Warning in read.table(file = anotlist[i], sep = "\t", fill = T, header =
## T): incomplete final line found by readTableHeader on './specimen_info/
## f74bd49a-d0f8-42b4-98de-db3a5a08d049/annotations.txt'
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## [1] "ff17109c-7482-4bc5-aa01-3989a22d00c6"
## Warning in read.table(file = anotlist[i], sep = "\t", fill = T, header =
## T): incomplete final line found by readTableHeader on './specimen_info/
## ff17109c-7482-4bc5-aa01-3989a22d00c6/annotations.txt'
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
# Coerce every column of the annotation table to character with the
# df_fac2chr() helper defined earlier in this file.
anot_df <- anot_df %>%
  df_fac2chr()
#anotlist

# Render the annotation table as an interactive DT widget.
DT::datatable(data = anot_df, caption = "annotation if exist.")
# Classify a single value as "empty" or not.
#
# str: one value (the checks below are used as `if` conditions, so a single
#      element is expected).
# Returns TRUE for NA and for the empty string "", FALSE for any other
# character value, and NA for a non-missing value that is not a character
# string (e.g. a number).
is_nullstr <- function(str){
  # Missing values count as empty.
  if(is.na(str)){
    return(TRUE)
  }
  if(str == ""){
    return(TRUE)
  }
  # From here on the value is known to differ from "", so only its type
  # decides between "not empty" and "cannot tell".
  if(is.character(str)) FALSE else NA
}

# Remove the columns of `df` that carry no information, i.e. columns in which
# every value is NA or the empty string "".
#
# df: a data.frame; character, factor and numeric columns are all accepted.
# Returns `df` restricted to the informative columns. The result keeps its
# data.frame shape even when a single column survives. A table without rows
# is returned unchanged, because emptiness cannot be judged without values.
rm_null_col <- function(df){
  if(nrow(df) == 0){
    return(df)
  }
  keep_col <- vapply(seq_len(ncol(df)), function(i){
    values <- df[[i]]
    # as.character() lets factor and numeric columns be compared with "".
    # NA == "" is NA, but TRUE | NA is TRUE, so missing values count as empty.
    is_empty <- is.na(values) | as.character(values) == ""
    !all(is_empty)
  }, logical(1))
  # drop = FALSE: never collapse a one-column result into a bare vector.
  df[, keep_col, drop = FALSE]
}


# is_nullstr(NA)
# 
# sapply(specim[,40], is_nullstr) %>%
#   sum()
# 
# specim[,40][2] %>%
#   class()
# is_nullstr("")
# is_nullstr(1)
# Keep only the specimen rows whose sample_type_id is "01".
specim0 <- spec_df %>%
  dplyr::filter(sample_type_id =="01")
  
# Full outer join with the annotation table on the shared "tag" column.
specim <- merge(specim0, anot_df, by= "tag", all = T)
  
# Drop all-empty columns, derive `barcode` from the first 12 characters of
# bcr_sample_barcode, and keep only barcodes listed in clin$Row.names.
specim1 <- rm_null_col(specim) %>%
  mutate(barcode = str_sub(bcr_sample_barcode, 1, 12)) %>%
  dplyr::filter(barcode %in% clin$Row.names) 

# Show barcodes that occur more than once.
specim1$barcode[duplicated(specim1$barcode)]
## [1] "TCGA-G3-A3CG" "TCGA-G3-A3CG"
# Keep the first row of every barcode.
# NOTE(review): which of the duplicated rows survives depends on row order
# after merge() — confirm that any of them is acceptable.
specim1 <- specim1 %>%
  dplyr::filter(!duplicated(.[["barcode"]]))

# Attach the clinical table (its first column is the join key) and keep only
# rows that have a limit_tag value.
specim_clin <- merge(specim1, clin, by.x = "barcode", by.y = 1) %>%
  dplyr::filter(!is.na(limit_tag))
# Output directory for the PNG files written by boxjit(); it is not created
# here (the dir.create() call below is commented out).
outdir <- "./pngdir/"
#dir.create(outdir)



# Draw a box plot with jittered points of one numeric column split by a
# grouping column, save it as a PNG, and compare the first two groups with a
# two-sample Kolmogorov-Smirnov test.
#
# data:  data.frame holding both columns.
# categ: name (string) of the grouping column, shown on the x axis.
# gene:  name (string) of the value column; it is converted with as.numeric().
# log:   if TRUE, plot log2(value + 1) instead of the raw value. The KS test
#        always uses the untransformed values.
#
# Side effects: prints `gene`, draws the plot and writes
# <outdir><categ><gene>_{logscale|linearscale}_boxjit.png, where `outdir` is a
# variable defined outside the function (it is not an argument). Afterwards it
# prints either the KS test result or "test avoided" when one of the two
# groups has fewer than five usable values.
boxjit <- function(data, categ ,gene, log=T){
  # Report the column being analysed: the argument, not a global variable.
  print(gene)

  label_x <- categ
  if(log){
    label_y <- paste("plus 1 and log2 ", gene, sep = "")
  }else{
    label_y <- paste("linear ", gene, sep = "")
  }

  log_bool <- if(log) "_logscale" else "_linearscale"
  filename <- paste(outdir, categ, gene, log_bool, "_boxjit.png", sep = "")

  # Copy the two requested columns to fixed names, then drop rows where
  # either one is missing or empty before converting the values to numeric.
  data <- data %>%
    mutate_(eval_categ = categ) %>%
    mutate_(eval_gene = gene) %>%
    dplyr::filter(!is.na(eval_categ) & !is.na(eval_gene)) %>%
    dplyr::filter(eval_categ != "" & eval_gene != "") %>%
    mutate(eval_gene = as.numeric(eval_gene))

  if(log){
      stage2gen1 <- ggplot()+
        theme_classic()+xlab(label_x)+ylab(label_y)+
        geom_boxplot(data = data, aes(x=eval_categ, y=log2(eval_gene+1)), outlier.colour = NA)+
        geom_jitter(data = data, aes(x=eval_categ, y=log2(eval_gene+1)),size =0.5, color ="red", height = 0)
  }else{
      stage2gen1 <- ggplot()+
        theme_classic()+xlab(label_x)+ylab(label_y)+
        geom_boxplot(data = data, aes(x=eval_categ, y=eval_gene), outlier.colour = NA)+
        geom_jitter(data = data, aes(x=eval_categ, y=eval_gene),size =0.5, color ="red", height = 0)
  }

  grid::grid.draw(stage2gen1)
  ggsave(filename = filename, plot = stage2gen1,device = "png",width = 9, height = 9, units = "cm", dpi = 300)

  # Only the first two levels of the grouping column are compared.
  # which() discards comparisons that evaluate to NA (this happens when fewer
  # than two levels exist), so a missing level gives an empty group instead of
  # a vector of NAs. NA values are removed so the size guard below counts
  # usable observations only.
  label_test <- levels(factor(data[["eval_categ"]]))
  x <- data[["eval_gene"]][which(data[["eval_categ"]] == label_test[1])]
  y <- data[["eval_gene"]][which(data[["eval_categ"]] == label_test[2])]
  x <- x[!is.na(x)]
  y <- y[!is.na(y)]

  if(length(x) < 5 | length(y) < 5){
    print(length(x))
    print("test avoided")
  }else{
    print("ks.test")
    ks.test(x, y) %>%
      print()
  }

}
# Compare initial_weight between the limit_tag groups on a linear scale.
categ <- "limit_tag"
key <- "initial_weight"
boxjit(specim_clin, categ = categ, gene = key, log = F)
## [1] "initial_weight"
## [1] "ks.test"
## Warning in ks.test(x, y): cannot compute exact p-value with ties

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.19318, p-value = 0.8345
## alternative hypothesis: two-sided
# Same comparison for days_to_collection; `categ` keeps its earlier value.
key <- "days_to_collection"
boxjit(specim_clin, categ = categ, gene = key, log = F)
## [1] "days_to_collection"
## [1] "ks.test"
## Warning in ks.test(x, y): cannot compute exact p-value with ties

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.21351, p-value = 0.733
## alternative hypothesis: two-sided
# Replace every NA in `vec` by 0; all other elements are passed through
# ifelse() unchanged. The result has one element per element of `vec`.
setNA2Zero <- function(vec){
  is_missing <- is.na(vec)
  ifelse(test = is_missing, yes = 0, no = vec)
}

# Replace every NA in `vec` by the empty string ""; all other elements are
# passed through ifelse() unchanged.
setNA2Emp <- function(vec){
  is_missing <- is.na(vec)
  ifelse(test = is_missing, yes = "", no = vec)
}


# clstInts2Strs <- function(vec){
#   int2str <- function(num){
#     if(num ==1){
#       return("Major")
#     }else if(num ==2){
#       return("Minor")
#     }else{
#       return("Unknown")
#     }
#   }
#   
#   strvec <- sapply(vec, int2str)
#   return(strvec)
# }

# Top-level variables holding the table, the cluster column name and the
# label column name.
# NOTE(review): these look like scratch values for trying the helpers below
# interactively — confirm whether later chunks depend on them.
dat <- specim_clin
clst <- "limit_tag"
key <- "oct_embedded"
# dat[[clst]]
# Print, for every row, whether the cluster label is missing.
is.na(dat[[clst]])
##   [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [56] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [67] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [78] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [89] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [100] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [111] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [122] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [144] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [155] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [166] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [177] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Build a table of counts of `key` by `clst`.
#
# dat:  table containing both columns.
# clst: name (string) of the cluster column; its values label the columns of
#       the result.
# key:  name (string) of the label column; its values (after the name
#       sanitising done by as.data.frame()) label the rows of the result.
# Returns the transposed count table: one row per value of `key`, one column
# per value of `clst`. Combinations that never occur are 0.
getMat <- function(dat, clst = "limit_tag", key){
  d <- dat %>%
    as.data.frame() %>%
    # Drop rows whose cluster value is missing. The test must look at the
    # column named by `clst`; is.na(clst) would only test the name itself and
    # therefore never remove a row.
    dplyr::filter(!is.na(.[[clst]])) %>%
    dplyr::select(one_of(c(clst, key))) #%>%
    #dplyr::mutate(!!clst := clstInts2Strs(.[[clst]]))
  
  # One row per (cluster, label) pair with the number of occurrences.
  smr <- d %>%
    mutate(count = 1) %>%
    group_by_(clst, key) %>%
    summarise(count = sum(count))
  
  # Spread the labels into columns, fill absent combinations with 0, use the
  # cluster values as row names, then transpose so that labels become rows.
  mt <- smr %>%
    tidyr::spread_(key = key, value = "count") %>%
    purrr::map_at(.at = c(2:(ncol(.))), setNA2Zero) %>%
    as.data.frame() %>%
    `rownames<-`(.[[clst]]) %>%
    dplyr::select(-matches(clst)) %>%
    t() 
  #    dplyr::select(-ends_with("NA.")) %>%
  
  return(mt)
}

# Drop the last row of `mat` when its row name contains "NA".
#
# mat: matrix (or data.frame) with row names.
# Returns `mat` without its last row if that row's name contains the text
# "NA"; otherwise prints "NA row not detected." and returns `mat` unchanged.
# The result keeps its two-dimensional shape even when one row remains.
trimLastNARow <- function(mat){
  row_names <- rownames(mat)
  # Without row names there is nothing to inspect; treat it as "not detected".
  last_name <- if(length(row_names) > 0){
    row_names[length(row_names)]
  }else{
    NA_character_
  }

  if(!is.na(last_name) && grepl("NA", last_name, fixed = TRUE)){
    # seq_len() states the intended 1..(n-1) range explicitly, and
    # drop = FALSE prevents a single remaining row from becoming a vector.
    mat <- mat[seq_len(nrow(mat) - 1), , drop = FALSE]
  }else{
    print("NA row not detected.")
  }
  return(mat)
}

# Run and print a chi-squared test followed by Fisher's exact test on the
# count table `mat`. The value of the final print() call, i.e. the Fisher
# test result, is returned invisibly.
getChiseqFish <- function(mat){
  chisq_result <- chisq.test(mat)
  print(chisq_result)
  fisher_result <- fisher.test(mat)
  print(fisher_result)
}

# Run the contingency analysis for the column of `dat` at position `i`.
#
# Prints the name of the analysed column, renders the count table produced by
# getMat() (with its default cluster column) as a DT widget, then prints the
# chi-squared and Fisher tests computed on that table after trimLastNARow().
# Returns the untrimmed count table when `get_bool` is TRUE; otherwise NULL,
# invisibly.
checkLab_I <- function(dat, i, get_bool =F){
  label <- colnames(dat)[i]
  print(paste0("analysing label: ", label))

  counts <- getMat(dat, key = label)
  widget <- DT::datatable(data = counts, caption = label)
  print(htmltools::tagList(widget))

  getChiseqFish(trimLastNARow(counts))

  if(get_bool) counts else invisible(NULL)
}

# Keep only the columns of `d` whose names start with "Row" or match `clst`
# or `key`. Both `clst` and `key` are handed to str_detect() as patterns, so
# a column is kept when the pattern matches anywhere in its name.
getKeyWord <- function(d, clst = "limit_tag", key){
  col_names <- colnames(d)
  wanted <- str_detect(col_names, "^Row") |
    str_detect(col_names, clst) |
    str_detect(col_names, key)
  dplyr::select(d, one_of(col_names[wanted]))
}

# Save `plot` as "<expression passed as plot>.png" in the working directory.
#
# plot: the plot to save; the file name is taken from the expression the
#       caller wrote for this argument (normally a variable name).
# wid, hei: width and height of the image in centimetres (300 dpi).
ggsave2 <- function(plot, wid=9, hei=9){
  # deparse() may split a long expression into several strings; join them
  # first so that ".png" is appended exactly once, at the end.
  plot_name <- paste(deparse(substitute(plot)), collapse = "")
  file_name <- paste0(plot_name, ".png")
  ggsave(filename = file_name,plot = plot,device = "png",width = wid, height = hei,dpi = 300,units = "cm")
}
# Contingency check of oct_embedded against the cluster label.
key <- "oct_embedded"
# Keep the columns matched by getKeyWord() and copy the column named by the
# variable `key` into a column that is literally called "key".
d <- specim_clin %>%
  getKeyWord(key = key) %>%
  mutate_(key = key)

# i = 3 picks the analysed column by position.
# NOTE(review): this depends on the column order returned by getKeyWord() —
# selecting by name would be safer.
checkLab_I(d, i = 3, get_bool = T)
[1] “analysing label: key”

[1] “NA row not detected.”

## Warning in chisq.test(mat): Chi-squared approximation may be incorrect
Pearson's Chi-squared test with Yates' continuity correction

data: mat X-squared = 0.019516, df = 1, p-value = 0.8889

Fisher's Exact Test for Count Data

data: mat p-value = 0.7358 alternative hypothesis: true odds ratio is not equal to 1 95 percent confidence interval: 0.1778547 3.5737974 sample estimates: odds ratio 0.7351366

  Major\nt2~4 Minor

false 52 4 true 124 7

# Keep slide rows with sample_type_id "01" whose portion barcode also appears
# in rna_meta.
slide_df <- slide_df %>%
  dplyr::filter(sample_type_id =="01") %>%
  dplyr::filter(bcr_portion_barcode %in% rna_meta$bcr_portion_barcode)

# Number of rows that repeat an earlier portion barcode.
sum(duplicated(slide_df$bcr_portion_barcode))
## [1] 23
# List the repeated portion barcodes.
slide_df$bcr_portion_barcode %>%
  .[duplicated(.)]
##  [1] "TCGA-BC-A10Z-01A-11" "TCGA-BC-A112-01A-11" "TCGA-BC-A10U-01A-11"
##  [4] "TCGA-BC-4073-01B-02" "TCGA-BC-A110-01A-11" "TCGA-DD-A11D-01A-11"
##  [7] "TCGA-DD-A11C-01A-11" "TCGA-DD-A114-01A-11" "TCGA-DD-A119-01A-11"
## [10] "TCGA-DD-A11B-01A-11" "TCGA-BC-A10Q-01A-11" "TCGA-BC-A10S-01A-22"
## [13] "TCGA-CC-A123-01A-11" "TCGA-DD-A11A-01A-11" "TCGA-BC-A10T-01A-11"
## [16] "TCGA-EP-A12J-01A-11" "TCGA-DD-A113-01A-11" "TCGA-BC-A10W-01A-11"
## [19] "TCGA-DD-A116-01A-11" "TCGA-BC-A10R-01A-11" "TCGA-BC-A10X-01A-11"
## [22] "TCGA-BC-A10Y-01A-11" "TCGA-DD-A115-01A-11"
# Attach the clinical table (its first column is the join key) and keep only
# rows that have a limit_tag value.
# NOTE(review): the duplicated portion barcodes reported above are not removed
# before this merge, so one portion can contribute several rows to the
# comparisons below — confirm this is intended.
slide_clin <- merge(slide_df, clin, by.x = "barcode", by.y = 1) %>%
  dplyr::filter(!is.na(limit_tag))

# Names of all columns containing "percent", minus the 6th and 11th match.
# NOTE(review): the two exclusions are positional, so they silently change
# meaning if the column order changes — excluding by name would be safer.
percent_tags <- colnames(slide_clin) %>%
  .[str_detect(., "percent")] %>%
  .[-c(6, 11)]

  
#percent_tags 
# Draw the box/jitter plot and run the KS comparison for every selected
# percent column. seq_along() keeps the loop from running at all when
# percent_tags is empty (1:length() would iterate over 1 and 0).
categ <- "limit_tag"
for(i in seq_along(percent_tags)){
  key <- percent_tags[i]
  boxjit(slide_clin, categ = categ, gene = key, log = F)
}
## [1] "percent_tumor_cells"
## [1] "ks.test"
## Warning in ks.test(x, y): cannot compute exact p-value with ties

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.17611, p-value = 0.8448
## alternative hypothesis: two-sided
## 
## [1] "percent_tumor_nuclei"
## [1] "ks.test"
## Warning in ks.test(x, y): cannot compute exact p-value with ties

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.19352, p-value = 0.7523
## alternative hypothesis: two-sided
## 
## [1] "percent_normal_cells"
## [1] "ks.test"
## Warning in ks.test(x, y): cannot compute exact p-value with ties

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.045344, p-value = 1
## alternative hypothesis: two-sided
## 
## [1] "percent_necrosis"
## [1] "ks.test"
## Warning in ks.test(x, y): cannot compute exact p-value with ties

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.12308, p-value = 0.9928
## alternative hypothesis: two-sided
## 
## [1] "percent_stromal_cells"
## [1] "ks.test"
## Warning in ks.test(x, y): cannot compute exact p-value with ties

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.22348, p-value = 0.5777
## alternative hypothesis: two-sided
## 
## [1] "percent_lymphocyte_infiltration"
## [1] "ks.test"
## Warning in ks.test(x, y): cannot compute exact p-value with ties

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.21667, p-value = 0.6654
## alternative hypothesis: two-sided
## 
## [1] "percent_monocyte_infiltration"
## [1] "ks.test"
## Warning in ks.test(x, y): cannot compute exact p-value with ties

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.13063, p-value = 0.9906
## alternative hypothesis: two-sided
## 
## [1] "percent_granulocyte_infiltration"

## [1] 0
## [1] "test avoided"
## [1] "percent_neutrophil_infiltration"
## [1] "ks.test"
## Warning in ks.test(x, y): cannot compute exact p-value with ties

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.1545, p-value = 0.9507
## alternative hypothesis: two-sided
# Keep analyte rows with sample_type_id "01", a portion barcode that also
# appears in rna_meta, and analyte_type "RNA".
analyte_df <- analyte_df %>%
  dplyr::filter(sample_type_id =="01") %>%
  dplyr::filter(bcr_portion_barcode %in% rna_meta$bcr_portion_barcode) %>%
  dplyr::filter(analyte_type =="RNA")

# Attach the clinical table (its first column is the join key) and keep only
# rows that have a limit_tag value.
anal_clin <- merge(analyte_df, clin, by.x = "barcode", by.y = 1) %>%
  dplyr::filter(!is.na(limit_tag))

# Box/jitter plot and KS comparison for each analyte measurement listed in
# anal_keys, split by the limit_tag groups.
anal_keys <- c("concentration", "a260_a280_ratio")
categ <- "limit_tag"
for(i in seq_along(anal_keys)){
  key <- anal_keys[i]
  boxjit(anal_clin, categ = categ, gene = key, log = F)
}
## [1] "concentration"
## [1] "ks.test"
## Warning in ks.test(x, y): cannot compute exact p-value with ties

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.11364, p-value = 0.9993
## alternative hypothesis: two-sided
## 
## [1] "a260_a280_ratio"
## [1] "ks.test"
## Warning in ks.test(x, y): cannot compute exact p-value with ties

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.15909, p-value = 0.9558
## alternative hypothesis: two-sided
# Keep only the concentration and the cluster label.
check_con <- anal_clin %>%
  dplyr::select(concentration, limit_tag)

# Summary statistics of the concentration within the "Minor" group; the
# column is converted with as.numeric() before summarising.
check_con %>%
  dplyr::filter(limit_tag =="Minor") %>%
  .[["concentration"]] %>%
  as.numeric() %>%
  summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1500  0.1600  0.1700  0.1664  0.1700  0.1800
# tmp <-  df %>% apply(., 1, function(vec){
#     !is.na(vec[intre_genes[1:3]])
#   }) %>%
#   as.vector()
# 
# tmp <-  df %>% 
#   apply(., 1, function(vec){
#     vec[intre_genes[1:3]] %>%
#       sapply(X = ., FUN = is.na) %>%
#       sum() ==3 %>%
#       return()
#   })
# 
#df <- mg_cla
#df[["patient.histological_type"]]
#rm(df)
# 
# categ <- "limit_tag"
# key <- "initial_weight"
# df <- specim_clin
# getCorFig <- function(df, categ="limit_tag", key, fact_cont, pos_neg, x_log =F){
#   df <- df %>%
#     mutate_(key = key) %>%
#     mutate_(categ = categ) %>%
#     dplyr::select(one_of(c("key", "categ"))) %>%
#     dplyr::filter(!is.na(key)) %>%
#     dplyr::filter(!is.na(categ)) %>%
#     dplyr::mutate(log_key = log2(as.numeric(key) +1))
#     
#   
#   #-1 *df$key
#   
#   if(pos_neg ==F & fact_cont =="cont"){
#     df <- df %>%
#       mutate(key = as.numeric(key)) %>%
#       mutate(key = -1*key) 
#   }else if(pos_neg ==T & fact_cont =="cont"){
#     df <- df %>%
#       mutate(key = as.numeric(key))
#   }else{
#     print("unknown pos_neg bool")
#   }
#   
#   if(x_log == T & fact_cont == "cont"){
#     x_log_str <- "p1log2"
#     df <- df %>%
#       mutate(key = log2(key +1))
#   }else{
#     x_log_str <- "linear"
#   }
#   
#   if(fact_cont =="cont"){
#     
#     scat <- ggplot()+theme_classic()+
#       geom_boxplot(data = df, mapping = aes_string(x = "categ", y ="key"), outlier.color = NA) +
#       geom_jitter(data = df, mapping = aes_string(x = "categ", y ="key"), size = 0.5, color="red") +
#       xlab(paste(x_log_str, categ))+ylab(paste(key, "linear value", sep = ":"))
#     
#     grid::grid.draw(scat)
#     devi <- "png"
#     file_ln <- paste(key, "linear_scat.", devi, sep = "")
#     ggsave(filename = file_ln, plot = scat, device = devi, width = 12, height = 12,units = "cm",dpi = 300)
#     
#     
#     log_scat <- ggplot()+theme_classic()+
#       geom_jitter(data = df_gt, mapping = aes_string(x = "key", y ="log", color = "gene"), size = 0.5)+
#       xlab(paste(x_log_str, key))+ylab("plus1 and log2 expression value")
#     
#     grid::grid.draw(log_scat)
#     file_lg <- paste(key, "p1log2_scat.", devi, sep = "")
#     ggsave(filename = file_lg, plot = log_scat,device = devi, width = 12, height = 12,units = "cm",dpi = 300)
#   }else if(fact_cont =="fact"){
#     df <- df %>%
#       dplyr::filter(!is.na(key)) %>%
#       dplyr::mutate(key = as.factor(key))
#     
#     df_gt <- df %>%
#       tidyr::gather(key = gene, value = gene_exp, one_of(intre_genes)) %>%
#       dplyr::filter(!is.na(gene_exp)) %>%
#       dplyr::mutate(log_gene_exp = log2(gene_exp +1)) %>%
#       .[sample(x = c(1:nrow(.)),size = nrow(.), replace = F),]
#     
#     boxjit <- ggplot()+theme_classic()+
#       geom_boxplot(data = df_gt, mapping = aes_string(x = "key", y="gene_exp", color="gene"), outlier.color = NA) +
#       geom_point(data = df_gt, mapping = aes_string(x ="key", y="log_gene_exp", color ="gene"), size = 0.2, position = position_jitterdodge(jitter.width = 0.2))+
#       xlab(key)
#     grid::grid.draw(boxjit)
#     
#     devi <- "png"
#     file_ln <- paste(key, "linear_boxjit.", devi, sep="")
#     ggsave(filename = file_ln, plot = boxjit, device = devi, width = 12, height = 12,units = "cm",dpi = 300)
#     
#     log_boxjit <- ggplot()+theme_classic()+
#       geom_boxplot(data = df_gt, mapping = aes_string(x = "key", y="log_gene_exp", color="gene"), outlier.color = NA) +
#       geom_point(data = df_gt, mapping = aes_string(x ="key", y="log_gene_exp", color ="gene"), size = 0.2, position = position_jitterdodge(jitter.width = 0.2)) +
#       xlab(key)
#     
#     grid::grid.draw(log_boxjit)  
#     file_lg <- paste(key, "linear_boxjit.", devi, sep="")
#     ggsave(filename = file_lg, plot = boxjit, device = devi, width = 12, height = 12,units = "cm",dpi = 300)
#   }
#   
# }
# 
# key <- "patient.days_to_birth"
# fact_cont <- "cont"
# pos_neg <- F
# 
# getCorFig(df = mg_cla, key = key, fact_cont = fact_cont, pos_neg = pos_neg)